Disponível em: http://insideairbnb.com/get-the-data.html
Referência para seguir: https://www.kaggle.com/josipdomazet/mining-nyc-airbnb-data-using-r
Download:
Importa o dataset
## -- Attaching packages ------------------------------------------------------------------------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Parsed with column specification:
## cols(
## .default = col_character(),
## id = col_double(),
## scrape_id = col_double(),
## last_scraped = col_date(format = ""),
## thumbnail_url = col_logical(),
## medium_url = col_logical(),
## xl_picture_url = col_logical(),
## host_id = col_double(),
## host_since = col_date(format = ""),
## host_is_superhost = col_logical(),
## host_listings_count = col_double(),
## host_total_listings_count = col_double(),
## host_has_profile_pic = col_logical(),
## host_identity_verified = col_logical(),
## neighbourhood_group_cleansed = col_logical(),
## latitude = col_double(),
## longitude = col_double(),
## is_location_exact = col_logical(),
## accommodates = col_double(),
## bathrooms = col_double(),
## bedrooms = col_double()
## # ... with 40 more columns
## )
## See spec(...) for full column specifications.
## Warning: 3 parsing failures.
## row col expected actual file
## 1745 license 1/0/T/F/TRUE/FALSE +1512-6670366 <connection>
## 28567 license 1/0/T/F/TRUE/FALSE 56131/AL <connection>
## 34253 license 1/0/T/F/TRUE/FALSE 05.557.336/0001-70 <connection>
## # A tibble: 35,451 x 23
## name host_name host_total_list~ calculated_host~ neighbourhood_c~
## <chr> <fct> <dbl> <dbl> <fct>
## 1 Very~ Matthias 2 1 Copacabana
## 2 Beau~ Viviane 3 3 Copacabana
## 3 NICE~ Renata 1 1 Ipanema
## 4 Cosy~ Patricia 1 1 Ipanema
## 5 COPA~ Patricia~ 1 1 Copacabana
## 6 Copa~ Seba 1 1 Copacabana
## 7 Beac~ Alex 7 6 Ipanema
## 8 Rio ~ Vana 2 1 Copacabana
## 9 4bed~ Marcio 7 5 Copacabana
## 10 HUma~ Marcio 7 5 Humaitá
## # ... with 35,441 more rows, and 18 more variables: latitude <dbl>,
## # longitude <dbl>, property_type <fct>, room_type <fct>, price <dbl>,
## # accommodates <dbl>, bedrooms <dbl>, minimum_nights <dbl>,
## # maximum_nights <dbl>, availability_365 <dbl>, number_of_reviews <dbl>,
## # review_scores_rating <dbl>, review_scores_accuracy <dbl>,
## # review_scores_location <dbl>, review_scores_value <dbl>,
## # cancellation_policy <fct>, require_guest_profile_picture <lgl>,
## # require_guest_phone_verification <lgl>
# Per-column summary of the listings table.
summary(airbnb)
## name host_name host_total_listings_count
## Length:35451 Daniel : 432 Min. : 0.000
## Class :character Ricardo: 322 1st Qu.: 1.000
## Mode :character Maria : 315 Median : 1.000
## Marcelo: 311 Mean : 9.691
## Mario : 309 3rd Qu.: 3.000
## (Other):33702 Max. :776.000
## NA's : 60 NA's :60
## calculated_host_listings_count neighbourhood_cleansed
## Min. : 1.000 Copacabana : 8825
## 1st Qu.: 1.000 Barra da Tijuca : 3908
## Median : 1.000 Ipanema : 2970
## Mean : 7.885 Jacarepaguá : 1917
## 3rd Qu.: 2.000 Botafogo : 1767
## Max. :265.000 Recreio dos Bandeirantes: 1750
## (Other) :14314
## latitude longitude property_type
## Min. :-23.07 Min. :-43.74 Apartment :27023
## 1st Qu.:-22.98 1st Qu.:-43.32 House : 3709
## Median :-22.97 Median :-43.20 Condominium : 1618
## Mean :-22.96 Mean :-43.25 Serviced apartment: 903
## 3rd Qu.:-22.94 3rd Qu.:-43.19 Loft : 653
## Max. :-22.75 Max. :-43.10 Bed and breakfast : 281
## (Other) : 1264
## room_type price accommodates
## Entire home/apt:25006 Min. : 0.0 Min. : 1.000
## Private room : 9586 1st Qu.: 150.0 1st Qu.: 2.000
## Shared room : 859 Median : 281.0 Median : 4.000
## Mean : 622.2 Mean : 4.175
## 3rd Qu.: 599.0 3rd Qu.: 5.000
## Max. :40000.0 Max. :160.000
##
## bedrooms minimum_nights maximum_nights availability_365
## Min. : 0.000 Min. : 1.000 Min. : 1 Min. : 0.0
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 30 1st Qu.: 0.0
## Median : 1.000 Median : 2.000 Median : 1125 Median :179.0
## Mean : 1.637 Mean : 4.736 Mean : 1527 Mean :190.1
## 3rd Qu.: 2.000 3rd Qu.: 4.000 3rd Qu.: 1125 3rd Qu.:362.0
## Max. :22.000 Max. :1123.000 Max. :10000000 Max. :365.0
## NA's :23
## number_of_reviews review_scores_rating review_scores_accuracy
## Min. : 0.000 Min. : 20.00 Min. : 2.000
## 1st Qu.: 0.000 1st Qu.: 93.00 1st Qu.:10.000
## Median : 1.000 Median : 98.00 Median :10.000
## Mean : 7.952 Mean : 94.35 Mean : 9.617
## 3rd Qu.: 5.000 3rd Qu.:100.00 3rd Qu.:10.000
## Max. :350.000 Max. :100.00 Max. :10.000
## NA's :17455 NA's :17473
## review_scores_location review_scores_value
## Min. : 2.000 Min. : 2.000
## 1st Qu.:10.000 1st Qu.: 9.000
## Median :10.000 Median :10.000
## Mean : 9.723 Mean : 9.312
## 3rd Qu.:10.000 3rd Qu.:10.000
## Max. :10.000 Max. :10.000
## NA's :17471 NA's :17472
## cancellation_policy require_guest_profile_picture
## flexible :15552 Mode :logical
## moderate : 5757 FALSE:34873
## strict : 2 TRUE :578
## strict_14_with_grace_period:13589
## super_strict_30 : 163
## super_strict_60 : 388
##
## require_guest_phone_verification
## Mode :logical
## FALSE:34850
## TRUE :601
##
##
##
##
# Compact overview of the table: one line per column with its type and
# first values.
airbnb %>% glimpse()
## Observations: 35,451
## Variables: 23
## $ name <chr> "Very Nice 2Br - Copacabana -...
## $ host_name <fct> Matthias, Viviane, Renata, Pa...
## $ host_total_listings_count <dbl> 2, 3, 1, 1, 1, 1, 7, 2, 7, 7,...
## $ calculated_host_listings_count <dbl> 1, 3, 1, 1, 1, 1, 6, 1, 5, 5,...
## $ neighbourhood_cleansed <fct> Copacabana, Copacabana, Ipane...
## $ latitude <dbl> -22.96592, -22.97712, -22.983...
## $ longitude <dbl> -43.17896, -43.19045, -43.214...
## $ property_type <fct> Condominium, Apartment, Apart...
## $ room_type <fct> Entire home/apt, Entire home/...
## $ price <dbl> 296, 161, 243, 337, 221, 150,...
## $ accommodates <dbl> 5, 3, 3, 3, 2, 2, 13, 1, 11, ...
## $ bedrooms <dbl> 2, 1, 1, 1, 1, 1, 6, 1, 4, 1,...
## $ minimum_nights <dbl> 4, 4, 2, 2, 3, 2, 2, 3, 4, 5,...
## $ maximum_nights <dbl> 30, 30, 1125, 89, 28, 30, 89,...
## $ availability_365 <dbl> 332, 352, 125, 122, 145, 89, ...
## $ number_of_reviews <dbl> 233, 232, 260, 160, 303, 1, 5...
## $ review_scores_rating <dbl> 93, 94, 96, 94, 98, NA, 91, 9...
## $ review_scores_accuracy <dbl> 9, 9, 10, 10, 10, NA, 9, 10, ...
## $ review_scores_location <dbl> 10, 10, 10, 10, 10, NA, 10, 1...
## $ review_scores_value <dbl> 9, 9, 10, 9, 10, NA, 9, 10, 6...
## $ cancellation_policy <fct> strict_14_with_grace_period, ...
## $ require_guest_profile_picture <lgl> FALSE, TRUE, FALSE, TRUE, FAL...
## $ require_guest_phone_verification <lgl> FALSE, TRUE, FALSE, TRUE, TRU...
# Data cleaning ----
# Keep only listings with at least one review and a non-zero price, then drop
# rows that are missing any review score or the bedroom count.
# (filter() also discards rows where the tested column is NA.)
airbnb <- airbnb %>%
  filter(number_of_reviews != 0, price != 0) %>%
  drop_na(
    review_scores_accuracy,
    review_scores_value,
    review_scores_rating,
    review_scores_location,
    bedrooms
  )

# Count the missing values left in each column and reshape the one-row result
# into a long (variables, missing) table. The review-score NAs were already
# removed above, so no further drop_na() is needed here.
missing_airbnb <- airbnb %>%
  summarise_all(~ sum(is.na(.))) %>%
  pivot_longer(everything(), names_to = "variables", values_to = "missing")

# Show only the columns that still contain missing values.
missing_airbnb %>% filter(missing > 0)
## # A tibble: 3 x 2
## variables missing
## <chr> <int>
## 1 name 2
## 2 host_name 40
## 3 host_total_listings_count 40
# Number of neighbourhoods shown individually; the rest are pooled as "Outros".
n_bairros <- 7

# Listings per neighbourhood, keeping the n_bairros most frequent ones and
# summing every other neighbourhood into a single "Outros" row.
bairros <- airbnb %>%
  count(neighbourhood_cleansed, sort = TRUE) %>%
  mutate(
    # Rows are sorted by descending count, so the row position is the rank.
    # Ranking by position (instead of indexing 1:n_bairros and padding with
    # rep()) also works when there are fewer than n_bairros neighbourhoods.
    bairro = if_else(
      row_number() <= n_bairros,
      as.character(neighbourhood_cleansed),
      "Outros"
    ),
    # Levels follow first appearance: top neighbourhoods by frequency, then
    # "Outros" last.
    bairro = factor(bairro, levels = unique(bairro))
  ) %>%
  # Sum the per-neighbourhood counts within each (possibly pooled) label.
  count(bairro, wt = n)
# Bar chart of the number of listings per neighbourhood group, with the count
# printed above each bar. The fill legend is hidden; the x axis names the bars.
bairros %>%
  ggplot(aes(bairro, n, fill = bairro)) +
  # Counts are precomputed in `n`, so geom_col() draws them as-is.
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  theme(legend.position = "none") +
  xlab("Bairro") +
  ylab("Frequência")
# Number of listings per room type, with the count printed above each bar.
airbnb %>%
  ggplot(aes(x = room_type, fill = room_type)) +
  geom_bar() +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    vjust = -0.4,
    size = 3.5
  )
# Number of property types shown individually; the rest are pooled as "Outros".
n_tipos <- 6

# Listings per property type, keeping the n_tipos most frequent types and
# summing every other type into a single "Outros" row.
tipos_propriedade <- airbnb %>%
  count(property_type, sort = TRUE) %>%
  mutate(
    # Rows are sorted by descending count, so the row position is the rank.
    # Ranking by position (instead of indexing 1:n_tipos and padding with
    # rep()) also works when there are fewer than n_tipos property types.
    tipo_propriedade = if_else(
      row_number() <= n_tipos,
      as.character(property_type),
      "Outros"
    ),
    # Levels follow first appearance: top types by frequency, then "Outros"
    # last.
    tipo_propriedade = factor(
      tipo_propriedade,
      levels = unique(tipo_propriedade)
    )
  ) %>%
  # Sum the per-type counts within each (possibly pooled) label.
  count(tipo_propriedade, wt = n)
# Bar chart of listings per property type, with the count printed above each
# bar. Axis text is blanked, so the fill legend is what identifies the bars.
tipos_propriedade %>%
  ggplot(aes(tipo_propriedade, n, fill = tipo_propriedade)) +
  # `legend` is not a parameter of this geom; passing it only produced an
  # "Ignoring unknown parameters: legend" warning, so it has been removed.
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  xlab("Tipo de propriedade") +
  ylab("Frequência") +
  theme(axis.text = element_blank())
# Listings per cancellation policy, most frequent policy first.
politicas_cancelamento <- count(airbnb, cancellation_policy, sort = TRUE)
politicas_cancelamento
## # A tibble: 6 x 2
## cancellation_policy n
## <fct> <int>
## 1 strict_14_with_grace_period 8722
## 2 flexible 4938
## 3 moderate 3971
## 4 super_strict_60 223
## 5 super_strict_30 90
## 6 strict 2
# Bar chart of listings per cancellation policy, bars ordered from most to
# least frequent. The x tick labels are blanked; the fill legend names the bars.
ggplot(
  politicas_cancelamento,
  aes(
    x = reorder(cancellation_policy, -n),
    y = n,
    fill = reorder(cancellation_policy, -n)
  )
) +
  geom_col() +
  geom_text(aes(label = n), vjust = -0.4, size = 3.5) +
  labs(
    x = "Política de cancelamento",
    y = "Frequência",
    fill = "Política de cancelamento"
  ) +
  theme(axis.text.x = element_blank())
# Theme overrides used by the pie charts: no axis titles, no x-axis text, no
# ticks, no grid lines, no panel border, and a bold 14pt plot title.
blank_theme <- theme(
  plot.title   = element_text(face = "bold", size = 14),
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  axis.text.x  = element_blank(),
  axis.ticks   = element_blank(),
  panel.grid   = element_blank(),
  panel.border = element_blank()
)
# Pie chart of listings by whether a guest profile picture is required.
# A single stacked bar is wrapped into a circle with polar coordinates, and
# each slice is labelled with its count at the middle of the slice.
ggplot(airbnb, aes(x = "", fill = require_guest_profile_picture)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  labs(fill = "", title = "Requer foto de perfil do hóspede")
# Pie chart of listings by whether guest phone verification is required.
# A single stacked bar is wrapped into a circle with polar coordinates, and
# each slice is labelled with its count at the middle of the slice.
ggplot(airbnb, aes(x = "", fill = require_guest_phone_verification)) +
  geom_bar(width = 1) +
  geom_text(
    aes(label = ..count..),
    stat = "count",
    position = position_stack(vjust = 0.5),
    color = "white"
  ) +
  coord_polar(theta = "y", start = 0) +
  blank_theme +
  labs(fill = "", title = "Requer que o hóspede tenha telefone verificado")
# Histogram of listing price on the raw (linear) scale, stacked by room type.
ggplot(airbnb, aes(price, fill = room_type)) +
  geom_histogram(bins = 30) +
  # Disabled alternative layer:
  # geom_density(alpha = 0.2, fill = "purple") +
  ggtitle("Distribuição de preço",
          subtitle = "A distribuição é muito inclinada") +
  # NOTE(review): both overrides are empty element_text() calls and appear to
  # change nothing visible; kept as in the original.
  theme(axis.title = element_text(), axis.title.x = element_text())
# Disabled mean-price marker:
# geom_vline(xintercept = round(mean(airbnb$price), 2), size = 2, linetype = 3)
# Histogram of listing price on a log10 x axis, stacked by room type.
airbnb %>%
  ggplot(aes(x = price, fill = room_type)) +
  geom_histogram(bins = 30) +
  scale_x_log10() +
  labs(
    title = "Distribuição transformada do preço",
    subtitle = expression("Com uma transformação" ~'log'[10] ~ "do eixo x")
  )
# Disabled layers kept for reference:
# theme(axis.title = element_text(), axis.title.x = element_text()) +
# geom_vline(xintercept = round(mean(airbnb$price), 2), size = 2, linetype = 3) +
# annotate("text", x = 1800, y = 0.75,label = paste("Mean price = ", paste0(round(mean(airbnb$price), 2), "$")),
# color = "#32CD32", size = 8)
# Price density histograms on a log10 x axis, one panel per room type.
# The legend is hidden because each panel is already titled with its room type.
airbnb %>%
  ggplot(aes(x = price, fill = room_type)) +
  geom_histogram(aes(y = ..density..), bins = 30, show.legend = FALSE) +
  scale_x_log10() +
  facet_wrap(~room_type)
# Boxplots of price per room type on a log10 y axis. The dashed purple line
# marks the arithmetic mean price over all listings.
airbnb %>%
  ggplot(aes(x = room_type, y = price)) +
  geom_boxplot(aes(fill = room_type)) +
  geom_hline(
    yintercept = mean(airbnb$price),
    color = "purple",
    linetype = 2
  ) +
  scale_y_log10() +
  labs(
    x = "Tipo de quarto",
    y = "Preço",
    title = "Boxplots of price by room type",
    subtitle = "Entire homes and apartments have the highest avg price"
  ) +
  theme(legend.position = "none")
library(corrplot)
## corrplot 0.84 loaded
# Add the natural log of price as an extra column.
airbnb$log_price <- log(airbnb$price)

# Keep numeric columns only. vapply() always returns a logical mask, whereas
# sapply() can change its return type depending on the input.
airbnb_cor <- airbnb[, vapply(airbnb, is.numeric, logical(1))]

# Drop rows that have a missing value in any of the numeric columns.
airbnb_cor <- airbnb_cor[complete.cases(airbnb_cor), ]

# Spearman rank correlation between all numeric columns, drawn as a colour grid.
correlation_matrix <- cor(airbnb_cor, method = "spearman")
corrplot(correlation_matrix, method = "color")
# Colour palette that maps each room type to a colour.
pal <- colorFactor(
  palette = c("red", "green", "blue", "purple", "yellow"),
  domain = airbnb$room_type
)

# Interactive map on a dark basemap: one small circle per listing, coloured by
# room type and labelled with the listing name, plus a room-type legend.
leaflet(data = airbnb) %>%
  addProviderTiles(providers$CartoDB.DarkMatterNoLabels) %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    radius = 1,
    weight = 1,
    color = ~pal(room_type),
    opacity = 1,
    fillOpacity = 0.1,
    label = ~paste("Name:", name)
  ) %>%
  addLegend(
    position = "bottomright",
    pal = pal,
    values = ~room_type,
    title = "Room types",
    opacity = 1
  )
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Interactive 3D scatter plot of the listings: longitude on x, latitude on y,
# price on z, coloured by room type.
# The trace type and mode are stated explicitly so plotly does not have to
# guess them (guessing emits "No trace type specified" / "No scatter3d mode
# specifed" messages).
plot_ly(
  airbnb,
  x = ~longitude,
  y = ~latitude,
  z = ~price,
  color = ~room_type,
  type = "scatter3d",
  mode = "markers"
)